import datetime as dt
import warnings
from datetime import timedelta
from pathlib import Path
import folium
import geopandas as gpd
import lightgbm as lgb
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from branca.colormap import linear
from folium import plugins
from folium.features import DivIcon, GeoJson, GeoJsonTooltip
from folium.plugins import HeatMap, MarkerCluster
from IPython.display import display
from matplotlib.ticker import FuncFormatter
from shapely import wkt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
# Silence noisy library warnings and show every DataFrame column in displays.
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

# Load the first six months of 2025 yellow-cab trips into one frame.
RAW_DIR = Path("../data/raw")
month_files = [RAW_DIR / f"yellow_tripdata_2025-{month:02d}.parquet" for month in range(1, 7)]
data = pd.concat((pd.read_parquet(path) for path in month_files), ignore_index=True)
df = data.copy()
Dataset Overview¶
# Preview the first five trips of the combined frame.
df.head()
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | Airport_fee | cbd_congestion_fee | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2025-01-01 00:18:38 | 2025-01-01 00:26:59 | 1.0 | 1.60 | 1.0 | N | 229 | 237 | 1 | 10.0 | 3.5 | 0.5 | 3.00 | 0.0 | 1.0 | 18.00 | 2.5 | 0.0 | 0.0 |
| 1 | 1 | 2025-01-01 00:32:40 | 2025-01-01 00:35:13 | 1.0 | 0.50 | 1.0 | N | 236 | 237 | 1 | 5.1 | 3.5 | 0.5 | 2.02 | 0.0 | 1.0 | 12.12 | 2.5 | 0.0 | 0.0 |
| 2 | 1 | 2025-01-01 00:44:04 | 2025-01-01 00:46:01 | 1.0 | 0.60 | 1.0 | N | 141 | 141 | 1 | 5.1 | 3.5 | 0.5 | 2.00 | 0.0 | 1.0 | 12.10 | 2.5 | 0.0 | 0.0 |
| 3 | 2 | 2025-01-01 00:14:27 | 2025-01-01 00:20:01 | 3.0 | 0.52 | 1.0 | N | 244 | 244 | 2 | 7.2 | 1.0 | 0.5 | 0.00 | 0.0 | 1.0 | 9.70 | 0.0 | 0.0 | 0.0 |
| 4 | 2 | 2025-01-01 00:21:34 | 2025-01-01 00:25:06 | 3.0 | 0.66 | 1.0 | N | 244 | 116 | 2 | 5.8 | 1.0 | 0.5 | 0.00 | 0.0 | 1.0 | 8.30 | 0.0 | 0.0 | 0.0 |
# Column dtypes and memory footprint of the combined half-year frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 24083384 entries, 0 to 24083383 Data columns (total 20 columns): # Column Dtype --- ------ ----- 0 VendorID int32 1 tpep_pickup_datetime datetime64[us] 2 tpep_dropoff_datetime datetime64[us] 3 passenger_count float64 4 trip_distance float64 5 RatecodeID float64 6 store_and_fwd_flag object 7 PULocationID int32 8 DOLocationID int32 9 payment_type int64 10 fare_amount float64 11 extra float64 12 mta_tax float64 13 tip_amount float64 14 tolls_amount float64 15 improvement_surcharge float64 16 total_amount float64 17 congestion_surcharge float64 18 Airport_fee float64 19 cbd_congestion_fee float64 dtypes: datetime64[us](2), float64(13), int32(3), int64(1), object(1) memory usage: 3.3+ GB
# Numeric-column summary, rendered with thousands separators.
numeric_summary = df.describe(include="number")
display(numeric_summary.style.format("{:,.2f}"))
| VendorID | passenger_count | trip_distance | RatecodeID | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | Airport_fee | cbd_congestion_fee | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 24,083,384.00 | 18,664,783.00 | 24,083,384.00 | 18,664,783.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 24,083,384.00 | 18,664,783.00 | 18,664,783.00 | 24,083,384.00 |
| mean | 1.84 | 1.29 | 6.83 | 2.48 | 162.22 | 161.78 | 0.95 | 17.89 | 1.22 | 0.48 | 2.86 | 0.48 | 0.96 | 26.36 | 2.21 | 0.14 | 0.53 |
| std | 0.61 | 0.74 | 630.57 | 11.57 | 65.98 | 70.21 | 0.74 | 191.09 | 1.85 | 0.14 | 3.89 | 2.07 | 0.28 | 191.56 | 0.93 | 0.51 | 0.36 |
| min | 1.00 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | 0.00 | -1,807.60 | -17.39 | -21.74 | -220.00 | -148.17 | -1.00 | -1,832.85 | -2.50 | -1.75 | -0.75 |
| 25% | 2.00 | 1.00 | 1.02 | 1.00 | 116.00 | 112.00 | 1.00 | 8.60 | 0.00 | 0.50 | 0.00 | 0.00 | 1.00 | 15.60 | 2.50 | 0.00 | 0.00 |
| 50% | 2.00 | 1.00 | 1.80 | 1.00 | 161.00 | 162.00 | 1.00 | 13.50 | 0.00 | 0.50 | 2.14 | 0.00 | 1.00 | 20.93 | 2.50 | 0.00 | 0.75 |
| 75% | 2.00 | 1.00 | 3.49 | 1.00 | 233.00 | 234.00 | 1.00 | 21.84 | 2.50 | 0.50 | 3.93 | 0.00 | 1.00 | 29.68 | 2.50 | 0.00 | 0.75 |
| max | 7.00 | 9.00 | 386,088.43 | 99.00 | 265.00 | 265.00 | 5.00 | 863,372.12 | 133.60 | 22.14 | 960.94 | 916.87 | 1.00 | 863,380.37 | 2.50 | 6.75 | 1.50 |
When I reviewed the yearly distribution, I noticed stray entries from 2007, 2009, and 2024, most likely artifacts or logging errors.
To stay aligned with the EDA’s 2025 focus, I restricted the dataset to that year, ensuring a clean and consistent foundation for further analysis.
# Count fully duplicated trip rows (0 in this dataset).
df.duplicated().sum()
np.int64(0)
# Count rides per pickup year; reveals stray 2007/2009/2024 records.
df.groupby(df["tpep_pickup_datetime"].dt.to_period("Y")).size()
tpep_pickup_datetime 2007 1 2009 2 2024 21 2025 24083360 Freq: Y-DEC, dtype: int64
# Keep only rides whose pickup falls in 2025 (drops the stray 2007/2009/2024 rows).
df = df[df["tpep_pickup_datetime"].dt.year == 2025]
Missing Values¶
I identified that about 22.5% of the data is missing in key fields such as passenger_count, RatecodeID, store_and_fwd_flag, congestion_surcharge, and Airport_fee.
The missingness is not isolated to one variable but occurs jointly, as shown by the heatmap correlations where these fields overlap completely.
The matrix visualization confirms that the same rows are consistently missing across multiple variables, indicating a systemic recording issue rather than random gaps.
This suggests that when one of these fields is missing, the others are also likely to be absent, meaning missingness should be handled as a linked block rather than feature by feature.
# Absolute null count per column.
df.isnull().sum()
VendorID 0 tpep_pickup_datetime 0 tpep_dropoff_datetime 0 passenger_count 5418601 trip_distance 0 RatecodeID 5418601 store_and_fwd_flag 5418601 PULocationID 0 DOLocationID 0 payment_type 0 fare_amount 0 extra 0 mta_tax 0 tip_amount 0 tolls_amount 0 improvement_surcharge 0 total_amount 0 congestion_surcharge 5418601 Airport_fee 5418601 cbd_congestion_fee 0 dtype: int64
# Null share per column, in percent, highest first.
100 * df.isna().mean().sort_values(ascending=False)
passenger_count 22.499356 congestion_surcharge 22.499356 store_and_fwd_flag 22.499356 RatecodeID 22.499356 Airport_fee 22.499356 tpep_dropoff_datetime 0.000000 VendorID 0.000000 tpep_pickup_datetime 0.000000 DOLocationID 0.000000 payment_type 0.000000 trip_distance 0.000000 PULocationID 0.000000 extra 0.000000 fare_amount 0.000000 mta_tax 0.000000 tip_amount 0.000000 improvement_surcharge 0.000000 tolls_amount 0.000000 total_amount 0.000000 cbd_congestion_fee 0.000000 dtype: float64
# Visualize joint missingness: matrix on a 10k-row sample, then the
# nullity correlation heatmap (the five sparse fields co-occur).
msno.matrix(df.sample(10000))
msno.heatmap(df)
<Axes: >
When I examined the distribution of missing passenger counts by VendorID, I found clear differences in reporting practices: Vendor 1 shows about 16% missing, Vendor 2 rises to 24%, and Vendor 6 records 100% missing.
Although I also looked at the monthly breakdown, the pattern of missing values appears broadly spread across months without a strong concentration in any specific period.
This suggests that the missingness can be linked to vendor-level reporting behavior rather than seasonal or monthly effects, making vendor practices the more likely source of these gaps.
# Share of missing passenger_count per vendor, in percent.
df.groupby("VendorID")["passenger_count"].apply(lambda s: 100 * s.isna().mean())
VendorID 1 16.046088 2 24.361130 6 100.000000 7 0.000000 Name: passenger_count, dtype: float64
# Rows with at least one missing field, counted per pickup month.
missing_rows = df.loc[df.isna().any(axis=1)]
missing_rows.groupby(missing_rows["tpep_pickup_datetime"].dt.month).size()
tpep_pickup_datetime 1 540149 2 806937 3 916663 4 745730 5 1196176 6 1212946 dtype: int64
By comparing fares, I noticed that trips with missing passenger counts tend to have slightly lower average fares, but the difference is not dramatic.
When I broke down the missing rows by day of month and hour of day, the distribution appeared relatively uniform across time, without clear concentration in specific days or hours.
This suggests that the missingness is not strongly tied to temporal patterns, but is more likely linked to vendor-level reporting practices observed earlier.
# Compare mean fares for trips with vs without a recorded passenger count.
na_subset = df.loc[df["passenger_count"].isna()]
non_na_subset = df.loc[df["passenger_count"].notna()]
print("Average fare (NA subset):", na_subset["fare_amount"].mean())
print("Average fare (non-NA subset):", non_na_subset["fare_amount"].mean())
Average fare (NA subset): 16.592194638431582 Average fare (non-NA subset): 18.260765692715346
# Re-derive the incomplete rows, then count them per day of month.
missing_rows = df[df.isna().any(axis=1)]
missing_rows.groupby(missing_rows["tpep_pickup_datetime"].dt.day).size()
tpep_pickup_datetime 1 195375 2 146888 3 135746 4 153790 5 185432 6 177710 7 173328 8 200369 9 185485 10 150436 11 183326 12 179856 13 176788 14 206437 15 208388 16 182742 17 175968 18 179065 19 182377 20 192604 21 192363 22 210060 23 179340 24 162689 25 172771 26 169004 27 167090 28 180836 29 158386 30 154603 31 99349 dtype: int64
# Incomplete rows per hour of day.
missing_rows.groupby(missing_rows["tpep_pickup_datetime"].dt.hour).size()
tpep_pickup_datetime 0 271412 1 180432 2 120160 3 86197 4 80294 5 75445 6 125586 7 203861 8 261885 9 192671 10 147380 11 149188 12 166977 13 183272 14 198075 15 211472 16 226430 17 324585 18 400518 19 347685 20 319489 21 360455 22 413182 23 371950 dtype: int64
Target Variable - Fare¶
The distribution of total fares reveals a right skew.
Most fares fall in the $15–30 range, with a median of $20.93.
However, the summary statistics expose extreme anomalies: a minimum of -$1,832.85 and a maximum exceeding $863k.
These values are unrealistic and reflect data errors or outliers.
# Percentile summary for total_amount, formatted with thousands separators.
fare_summary = (
    df["total_amount"]
    .describe(percentiles=[0.25, 0.50, 0.75, 0.99])
    .map(lambda v: f"{v:,.2f}")
)
print(fare_summary)
count 24,083,360.00 mean 26.36 std 191.56 min -1,832.85 25% 15.60 50% 20.93 75% 29.68 99% 103.86 max 863,380.37 Name: total_amount, dtype: object
# Histogram of total fares clipped to the $0-100 window.
plt.figure(figsize=(8, 5))
ax = df["total_amount"].plot(
    kind="hist",
    bins=100,
    range=(0, 100),
    color="skyblue",
    edgecolor="black",
)
# Fix: the y label claims millions but the axis showed raw counts;
# scale the ticks to millions so label and axis agree (same pattern
# as the monthly-rides chart).
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{y/1e6:.1f}"))
plt.xlabel("Total Fare ($)")
plt.ylabel("Number of Fares (M)")
plt.title("Distribution of Total Fare Amounts (0–100 $)")
plt.show()
Ride Counts¶
The monthly distribution of NYC taxi rides for the first half of 2025 shows a steady volume of between 3.5M and 4.6M rides per month.
While January and February record the lowest activity (around 3.5M rides), numbers climb steadily through the spring, peaking in May (~4.6M rides).
This pattern likely stems from seasonal effects, with demand increasing as weather improves and tourism rises.
# Monthly ride totals for Jan-Jun 2025, y axis in millions.
monthly_rides = df.groupby(df["tpep_pickup_datetime"].dt.to_period("M")).size()
month_names = ["January", "February", "March", "April", "May", "June"]
ax = monthly_rides.plot(kind="bar", figsize=(8, 4), colormap="coolwarm")
ax.set_xticks(range(len(monthly_rides)))
ax.set_xticklabels(month_names)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f"{y/1e6:.1f}"))
ax.set_xlabel("Month")
ax.set_ylabel("Number of Taxi Rides (millions)")
ax.set_title("Taxi Rides per Month - 2025")
plt.show()
The weekly breakdown of NYC taxi rides in early 2025 shows fairly steady volumes, usually between 0.9M and 1.3M rides per week.
In many months, the last week shows higher ride counts, especially in January, March, May, and June.
This could be related to the binning method: the fourth week groups days 21–31, so it includes more days than the first three bins.
Because of this, the higher counts in the last week may come partly from the grouping choice rather than actual changes in demand.
# Fix: the original bound `dt = df["tpep_pickup_datetime"]`, shadowing the
# module alias from `import datetime as dt`; use a distinct name instead.
pickups = df["tpep_pickup_datetime"]
ride_dates = pd.DataFrame(
    {"year": pickups.dt.year, "month": pickups.dt.month, "day": pickups.dt.day}
)
# Bin days of month into four "weeks": 1-7, 8-14, 15-21, 22-31
# (the last bin is deliberately wider, as noted in the text above).
ride_dates["week_in_month"] = pd.cut(
    ride_dates["day"], bins=[0, 7, 14, 21, 31], labels=[1, 2, 3, 4], include_lowest=True
).astype(int)
rides = (
    ride_dates.groupby(["year", "month", "week_in_month"])
    .size()
    .reset_index(name="rides")
)
pivot = rides.pivot_table(
    index="month", columns="week_in_month", values="rides", aggfunc="sum", fill_value=0
)
ax = pivot.plot(kind="bar", figsize=(10, 5), colormap="tab10")
ax.set_xlabel("Month")
ax.set_ylabel("Number of Taxi Rides")
# Fix: this data covers 2025, not 2016.
ax.set_title("Taxi Rides per Week (Grouped by Month, 2025)")
# Pin tick positions before relabeling to avoid the matplotlib warning.
ax.set_xticks(range(len(pivot)))
ax.set_xticklabels(["January", "February", "March", "April", "May", "June"], rotation=0)
plt.legend(title="Week in Month", labels=["1", "2", "3", "4"])
plt.show()
The average number of NYC taxi rides by hour of day shows a clear daily cycle.
Activity is lowest between 03:00–05:00 with under 1,500 rides/hour, then rises sharply after 06:00.
Ride counts climb steadily through the day, peaking around 18:00 with nearly 10,000 rides/hour.
Evening demand remains high until about 22:00 before tapering off toward midnight.
This pattern highlights strong rush hour and evening activity compared to the early-morning low point.
# Average rides per clock hour: count rides in each calendar hour,
# then average those counts across days by hour-of-day.
per_hour = df.groupby(df["tpep_pickup_datetime"].dt.floor("h")).size()
avg_by_hour = per_hour.groupby(per_hour.index.hour).mean()
ax = avg_by_hour.plot(kind="bar", figsize=(9, 4))
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Avg Rides per Hour")
ax.set_title("Average Taxi Rides by Hour of Day")
# Label ticks as HH:00.
ax.set_xticks(range(24))
ax.set_xticklabels([f"{hour:02d}:00" for hour in range(24)], rotation=90)
plt.show()
From January to June 2025, the number of daily NYC taxi rides shows a clear upward trend.
January–February stay mostly between 80k and 140k rides, while spring months like March and April move higher, often above 140k.
By May and June, ride counts frequently reach 160k–180k, marking the busiest period in this range.
Overall, the first half of 2025 shows a steady rise in demand, with stronger activity in the late spring compared to the winter months.
# Daily ride counts for Jan-Jun 2025, one colored line per month.
daily = (
    df.set_index("tpep_pickup_datetime")
    .resample("D")
    .size()
    .asfreq("D", fill_value=0)
    .rename("rides")
)
fig, ax = plt.subplots(figsize=(14, 6))
for month in range(1, 7):
    month_slice = daily.loc[f"2025-{month:02d}"]
    month_label = pd.Timestamp(2025, month, 1).strftime("%B 2025")
    ax.plot(month_slice.index, month_slice.values, lw=2, label=month_label)
ax.set_title("Daily NYC Yellow Taxi Rides — Jan–Jun 2025")
ax.set_xlabel("Date")
ax.set_ylabel("Rides")
# Thousands on the y axis, month abbreviations on the x axis.
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, pos: f"{int(y/1000)}k"))
ax.set_xlim(pd.Timestamp("2025-01-01"), pd.Timestamp("2025-06-30"))
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=5))  # fewer day ticks for clarity
ax.grid(True, axis="y", alpha=0.25)
ax.legend(ncol=3, frameon=False, loc="upper left")
for spine in ("top", "right"):
    ax.spines[spine].set_visible(False)
plt.tight_layout()
plt.show()
Trip Distance¶
The trip distance distribution is heavily right-skewed.
Most rides are short: the median is 1.8 miles, and 75% of trips are under 3.5 miles.
Even the 99th percentile stays below 20 miles, showing that nearly all trips are within city limits.
At the same time, extreme outliers exist, with a maximum recorded distance of over 386k miles, which is not realistic and points to data errors.
On the log-scaled histogram, the ride counts drop steeply as distance increases, highlighting how concentrated the data is around very short trips.
# Percentile summary for trip_distance with readable formatting.
distance_summary = (
    df["trip_distance"]
    .describe(percentiles=[0.25, 0.50, 0.75, 0.99])
    .map(lambda v: f"{v:,.2f}")
)
print(distance_summary)
# Log-scaled histogram of ride distances, clipped to 0-100 miles.
plt.figure(figsize=(8, 5))
plt.hist(
    df["trip_distance"],
    bins=50,
    range=(0, 100),
    color="skyblue",
    edgecolor="black",
    log=True,
)
plt.xlabel("Trip Distance (miles)")
plt.ylabel("Number of Rides")
plt.title("Distribution of Taxi Ride Distances (0–100 miles)")
plt.show()
The scatter plot of fare amount versus trip distance (0–20 miles) shows a clear positive relationship: as distance increases, fares generally rise.
Most points form a dense upward band, matching expected fare growth with distance.
At the same time, several unusual patterns appear. There are negative fares, which are not realistic, and flat horizontal lines at fixed values, suggesting data entry or system artifacts.
Some points also sit far below or above the main band, indicating outliers where fare and distance are inconsistent.
Overall, while the main trend is sensible, the presence of anomalies suggests that the raw data includes errors or special cases that need attention.
# Fare vs distance on a 200k-row sample.
# Fix: plt.figure was called twice, leaking an empty stray figure (the
# "<Figure ... with 0 Axes>" artifact); create a single figure. Also avoid
# clobbering the module-level `data` frame with the sample.
sample = df.sample(n=200_000, random_state=42)
plt.figure(figsize=(9, 5))
plt.scatter(sample["trip_distance"], sample["fare_amount"], s=1)
plt.xlim(0, 20)
plt.ylim(-250, 250)
plt.xlabel("Trip distance (miles)")
plt.ylabel("Fare amount ($)")
plt.grid(True, which="both", alpha=0.3)
plt.tight_layout()
plt.show()
<Figure size 800x500 with 0 Axes>
Trip Duration¶
The trip duration distribution is strongly right-skewed.
Most rides last under 60 minutes, with the highest concentration in the first 10–30 minutes.
Beyond this range, counts drop off quickly, though the log scale shows a long tail extending up to 600 minutes in the plotted window.
Durations this long are unusual for NYC taxi rides and could be due to data entry errors, recording gaps, or rare edge cases.
The overall shape mirrors that of trip distance: a large cluster of short trips with a small fraction of extremely long outliers.
# Trip duration in minutes, derived from the pickup/dropoff timestamps.
elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
df["trip_duration"] = elapsed.dt.total_seconds() / 60
# Log-scaled histogram of durations, clipped to 0-600 minutes.
plt.figure(figsize=(8, 5))
df["trip_duration"].plot(
    kind="hist", bins=100, range=(0, 600), color="skyblue", edgecolor="black", log=True
)
plt.xlabel("Trip Duration (0-600 Minutes)")
plt.ylabel("Number of Rides")
plt.title("Distribution of Taxi Ride Durations (Minutes)")
plt.show()
Passenger Count¶
The passenger count distribution shows that the vast majority of trips are single riders: 1 passenger accounts for over 14.8M rides.
Two-passenger trips are the next most common at about 2.6M, while counts for 3–6 passengers decrease steadily.
A smaller number of trips report 0 passengers (~139k), which is not realistic and likely due to data entry issues.
Very few rides list 7–9 passengers, with only a handful of entries, suggesting these are either rare cases or erroneous values.
Overall, the data confirms that NYC yellow taxis are primarily used by individual riders or small groups.
# Cast to pandas' nullable integer type so the NaNs survive the conversion.
df["passenger_count"] = df["passenger_count"].astype("Int64")
df["passenger_count"].value_counts()
passenger_count 1 14801253 2 2583176 3 590567 4 385702 0 138981 5 104553 6 60443 8 52 9 16 7 16 Name: count, dtype: Int64
# One unit-wide bin per passenger count, centered on the integer values.
low = int(df["passenger_count"].min())
high = int(df["passenger_count"].max())
bin_edges = np.arange(low - 0.5, high + 1.5, 1)
plt.figure(figsize=(8, 4))
plt.hist(df["passenger_count"], bins=bin_edges, rwidth=0.9, log=True)
plt.xticks(range(low, high + 1))
plt.title("Passenger count")
plt.xlabel("Passengers")
plt.ylabel("Trips")
plt.grid(True, axis="y", alpha=0.3)
plt.tight_layout()
plt.show()
Ride & Payment Types¶
The distribution of Rate Codes shows that nearly all rides use the Standard rate, which accounts for about 90% of trips.
Among the non-standard codes, the most common are JFK flat fare (~3.4%), Negotiated fare (~1.0%), and Null/Unknown (~1.4%).
Other categories such as Newark, Nassau/Westchester, and Group rides make up only a very small share of the data.
This confirms that the dataset is dominated by standard city trips, while airport transfers and special fares form a much smaller fraction.
# Raw RatecodeID frequencies (1 = standard rate, 99 = null/unknown).
df.RatecodeID.value_counts()
RatecodeID 1.0 17440651 2.0 650667 99.0 263813 5.0 193836 3.0 64341 4.0 51425 6.0 26 Name: count, dtype: int64
# Human-readable labels for the TLC RatecodeID values.
rate_code_dict = {
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau or Westchester",
    5: "Negotiated fare",
    6: "Group ride",
    99: "Null/unknown",
}
counts = df.groupby("RatecodeID").size()
total = counts.sum()
counts_other = counts.drop(1).sum()
# Left panel: standard vs everything else; right panel: the non-standard codes.
labels1 = ["Standard rate", "Other rates"]
sizes1 = [counts[1] / total * 100, counts_other / total * 100]
counts_nonstd = counts.drop(1)
labels2 = [rate_code_dict.get(i, str(i)) for i in counts_nonstd.index]
sizes2 = counts_nonstd / total * 100
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
axes[0].bar(labels1, sizes1, color=["skyblue", "orange"], edgecolor="black")
axes[0].set_ylabel("Percentage of Rides (%)")
axes[0].set_title("Standard vs Other Rate Codes")
axes[1].bar(labels2, sizes2, color="coral", edgecolor="black")
axes[1].set_ylabel("Percentage of Rides (%)")
axes[1].set_title("Breakdown of Non-Standard Rate Codes")
# Fix: pin tick positions before relabeling; calling set_xticklabels alone
# triggers a matplotlib warning and can mislabel if the locator changes.
axes[1].set_xticks(range(len(labels2)))
axes[1].set_xticklabels(labels2, rotation=45, ha="right")
plt.tight_layout()
plt.show()
The payment type distribution shows that Credit card is the dominant method, used in about 63% of rides.
Flex Fare (about 22%) and Cash (about 9%) are the next most common.
Other categories such as No charge (~0.6%), Dispute (~2.2%), and Unknown are very rare.
This indicates that most NYC taxi transactions are electronic payments, while cash and exceptional cases form a much smaller share.
# Raw payment_type frequencies; the count for code 0 matches the
# null-filled row count seen in the missingness section.
df.payment_type.value_counts()
payment_type 1 15609069 0 5418601 2 2353584 4 544216 3 157887 5 3 Name: count, dtype: int64
# Payment-type code -> label mapping (code 0 is used here for Flex Fare rows).
payment_dict = {
    0: "Flex Fare",
    1: "Credit card",
    2: "Cash",
    3: "No charge",
    4: "Dispute",
    5: "Unknown",
    6: "Voided trip",
}
type_counts = df.groupby("payment_type").size()
shares = 100 * type_counts / type_counts.sum()
ax = shares.plot(kind="bar", figsize=(9, 4), color="seagreen", edgecolor="black")
ax.set_xticks(range(len(shares)))
ax.set_xticklabels(
    [payment_dict.get(code, str(code)) for code in shares.index], rotation=45, ha="right"
)
ax.set_ylabel("Percentage of Rides (%)")
ax.set_xlabel("Payment Type")
ax.set_title("Distribution of Taxi Rides by Payment Type")
plt.tight_layout()
plt.show()
Geographical¶
# Re-preview the trips now that trip_duration has been added.
df.head()
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | Airport_fee | cbd_congestion_fee | trip_duration | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2025-01-01 00:18:38 | 2025-01-01 00:26:59 | 1 | 1.60 | 1.0 | N | 229 | 237 | 1 | 10.0 | 3.5 | 0.5 | 3.00 | 0.0 | 1.0 | 18.00 | 2.5 | 0.0 | 0.0 | 8.350000 |
| 1 | 1 | 2025-01-01 00:32:40 | 2025-01-01 00:35:13 | 1 | 0.50 | 1.0 | N | 236 | 237 | 1 | 5.1 | 3.5 | 0.5 | 2.02 | 0.0 | 1.0 | 12.12 | 2.5 | 0.0 | 0.0 | 2.550000 |
| 2 | 1 | 2025-01-01 00:44:04 | 2025-01-01 00:46:01 | 1 | 0.60 | 1.0 | N | 141 | 141 | 1 | 5.1 | 3.5 | 0.5 | 2.00 | 0.0 | 1.0 | 12.10 | 2.5 | 0.0 | 0.0 | 1.950000 |
| 3 | 2 | 2025-01-01 00:14:27 | 2025-01-01 00:20:01 | 3 | 0.52 | 1.0 | N | 244 | 244 | 2 | 7.2 | 1.0 | 0.5 | 0.00 | 0.0 | 1.0 | 9.70 | 0.0 | 0.0 | 0.0 | 5.566667 |
| 4 | 2 | 2025-01-01 00:21:34 | 2025-01-01 00:25:06 | 3 | 0.66 | 1.0 | N | 244 | 116 | 2 | 5.8 | 1.0 | 0.5 | 0.00 | 0.0 | 1.0 | 8.30 | 0.0 | 0.0 | 0.0 | 3.533333 |
# Load the TLC taxi-zone polygon shapefile and preview it.
zones = gpd.read_file("../data/raw/shape_data/taxi_zones.shp")
zones.head()
| OBJECTID | Shape_Leng | Shape_Area | zone | LocationID | borough | geometry | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.116357 | 0.000782 | Newark Airport | 1 | EWR | POLYGON ((933100.918 192536.086, 933091.011 19... |
| 1 | 2 | 0.433470 | 0.004866 | Jamaica Bay | 2 | Queens | MULTIPOLYGON (((1033269.244 172126.008, 103343... |
| 2 | 3 | 0.084341 | 0.000314 | Allerton/Pelham Gardens | 3 | Bronx | POLYGON ((1026308.77 256767.698, 1026495.593 2... |
| 3 | 4 | 0.043567 | 0.000112 | Alphabet City | 4 | Manhattan | POLYGON ((992073.467 203714.076, 992068.667 20... |
| 4 | 5 | 0.092146 | 0.000498 | Arden Heights | 5 | Staten Island | POLYGON ((935843.31 144283.336, 936046.565 144... |
# Schema and memory usage of the zone table (263 rows, 7 columns).
zones.info()
<class 'geopandas.geodataframe.GeoDataFrame'> RangeIndex: 263 entries, 0 to 262 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OBJECTID 263 non-null int32 1 Shape_Leng 263 non-null float64 2 Shape_Area 263 non-null float64 3 zone 263 non-null object 4 LocationID 263 non-null int32 5 borough 263 non-null object 6 geometry 263 non-null geometry dtypes: float64(2), geometry(1), int32(2), object(2) memory usage: 12.5+ KB
# Null check on the zone table (none present).
zones.isnull().sum()
OBJECTID 0 Shape_Leng 0 Shape_Area 0 zone 0 LocationID 0 borough 0 geometry 0 dtype: int64
# Full-row duplicate check on the zone table (none present).
zones.duplicated().sum()
np.int64(0)
# Source CRS: EPSG:2263 (NAD83 / New York Long Island, US survey feet).
zones.crs
<Projected CRS: EPSG:2263> Name: NAD83 / New York Long Island (ftUS) Axis Info [cartesian]: - X[east]: Easting (US survey foot) - Y[north]: Northing (US survey foot) Area of Use: - name: United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk. - bounds: (-74.26, 40.47, -71.8, 41.3) Coordinate Operation: - name: SPCS83 New York Long Island zone (US survey foot) - method: Lambert Conic Conformal (2SP) Datum: North American Datum 1983 - Ellipsoid: GRS 1980 - Prime Meridian: Greenwich
# Compute zone centroids in the projected CRS (accurate geometry math),
# then express them as WGS84 lon/lat columns and reproject the polygons.
centroids_proj = zones.to_crs(2263).geometry.centroid
centroids_ll = centroids_proj.to_crs(4326)
zones["lon"] = centroids_ll.x
zones["lat"] = centroids_ll.y
zones = zones.to_crs(4326)
zones.head()
| OBJECTID | Shape_Leng | Shape_Area | zone | LocationID | borough | geometry | lon | lat | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.116357 | 0.000782 | Newark Airport | 1 | EWR | POLYGON ((-74.18445 40.695, -74.18449 40.6951,... | -74.174002 | 40.691830 |
| 1 | 2 | 0.433470 | 0.004866 | Jamaica Bay | 2 | Queens | MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ... | -73.831300 | 40.616746 |
| 2 | 3 | 0.084341 | 0.000314 | Allerton/Pelham Gardens | 3 | Bronx | POLYGON ((-73.84793 40.87134, -73.84725 40.870... | -73.847422 | 40.864474 |
| 3 | 4 | 0.043567 | 0.000112 | Alphabet City | 4 | Manhattan | POLYGON ((-73.97177 40.72582, -73.97179 40.725... | -73.976968 | 40.723752 |
| 4 | 5 | 0.092146 | 0.000498 | Arden Heights | 5 | Staten Island | POLYGON ((-74.17422 40.56257, -74.17349 40.562... | -74.188485 | 40.552659 |
# Frequency tables for the key attribute columns; exposes repeated zone
# names and LocationIDs (e.g. Corona, Governor's Island group).
for column in ["zone", "LocationID", "borough"]:
    print(zones[column].value_counts(), "\n")
zone
Governor's Island/Ellis Island/Liberty Island 3
Corona 2
Two Bridges/Seward Park 1
Belmont 1
West Farms/Bronx River 1
..
Bayside 1
Bedford 1
Bedford Park 1
Bellerose 1
Alphabet City 1
Name: count, Length: 260, dtype: int64
LocationID
103 3
56 2
232 1
20 1
248 1
..
16 1
17 1
18 1
19 1
4 1
Name: count, Length: 260, dtype: int64
borough
Queens 69
Manhattan 69
Brooklyn 61
Bronx 43
Staten Island 20
EWR 1
Name: count, dtype: int64
The zone polygons contain duplicates for both the name and the LocationID (e.g., Corona — 56 appears twice; Governor’s Island/Ellis/Liberty — 103 appears multiple times).
This likely comes from multi-part features or overlapping sliver geometries in the source shapefile.
Applying buffer(0) cleans invalid or self-intersecting geometries, and dissolve(by="LocationID") merges all parts of the same zone into a single geometry.
Attribute conflicts are handled with a simple mode_or_first rule so the zone and borough labels stay consistent after merging.
The CRS is set to EPSG:2263 for accurate geometry work, then centroids are computed and converted to EPSG:4326 to add lon/lat columns.
The resulting table shows one row per LocationID with cleaned geometry and usable coordinates for joins and mapping.
# Show rows sharing a zone name or a LocationID. Fix: only the last
# expression in a notebook cell renders, so the zone-name table was
# silently discarded — display both explicitly.
display(zones[zones.duplicated(subset=["zone"], keep=False)])
display(zones[zones.duplicated(subset=["LocationID"], keep=False)])
| OBJECTID | Shape_Leng | Shape_Area | zone | LocationID | borough | geometry | lon | lat | |
|---|---|---|---|---|---|---|---|---|---|
| 55 | 56 | 0.056848 | 0.000181 | Corona | 56 | Queens | POLYGON ((-73.85361 40.7475, -73.85325 40.7468... | -73.858845 | 40.741407 |
| 56 | 57 | 0.019271 | 0.000018 | Corona | 56 | Queens | POLYGON ((-73.85131 40.74984, -73.85443 40.748... | -73.853582 | 40.751819 |
| 102 | 103 | 0.014306 | 0.000006 | Governor's Island/Ellis Island/Liberty Island | 103 | Manhattan | POLYGON ((-74.04389 40.69018, -74.04351 40.689... | -74.045288 | 40.689860 |
| 103 | 104 | 0.021221 | 0.000012 | Governor's Island/Ellis Island/Liberty Island | 103 | Manhattan | POLYGON ((-74.03995 40.70089, -74.03945 40.700... | -74.040771 | 40.698769 |
| 104 | 105 | 0.077425 | 0.000369 | Governor's Island/Ellis Island/Liberty Island | 103 | Manhattan | POLYGON ((-74.01675 40.69334, -74.0154 40.6930... | -74.019073 | 40.688784 |
# buffer(0) rebuilds each polygon, repairing invalid/self-intersecting rings.
# NOTE(review): geometry is in EPSG:4326 at this point; buffer(0) takes no
# distance, so the geographic CRS is harmless for this repair — confirm.
zones["geometry"] = zones.geometry.buffer(0)
def mode_or_first(s: pd.Series):
    """Return the modal value of *s*; fall back to its first non-null, else None."""
    modes = s.mode(dropna=True)
    if not modes.empty:
        return modes.iloc[0]
    non_null = s.dropna()
    return non_null.iloc[0] if not non_null.empty else None
# Merge duplicate zone rows: dissolve unions all geometries sharing a
# LocationID, with zone/borough labels resolved via mode_or_first.
label_rules = {"zone": mode_or_first, "borough": mode_or_first}
zones = zones.dissolve(by="LocationID", aggfunc=label_rules).reset_index()
# Centroids are computed in the NY Long Island state-plane CRS for accuracy.
if zones.crs is None or zones.crs.to_epsg() != 2263:
    zones = zones.to_crs(2263)
zone_centers = zones.geometry.centroid
zone_centers_ll = zone_centers.to_crs(4326)
zones["lon"] = zone_centers_ll.x
zones["lat"] = zone_centers_ll.y
zones.head()
| LocationID | geometry | zone | borough | lon | lat | |
|---|---|---|---|---|---|---|
| 0 | 1 | POLYGON ((933100.918 192536.086, 933091.011 19... | Newark Airport | EWR | -74.174002 | 40.691830 |
| 1 | 2 | MULTIPOLYGON (((1020447.262 151454.148, 102035... | Jamaica Bay | Queens | -73.831300 | 40.616746 |
| 2 | 3 | POLYGON ((1026308.77 256767.698, 1026495.593 2... | Allerton/Pelham Gardens | Bronx | -73.847422 | 40.864474 |
| 3 | 4 | POLYGON ((992073.467 203714.076, 992068.667 20... | Alphabet City | Manhattan | -73.976968 | 40.723752 |
| 4 | 5 | POLYGON ((935843.31 144283.336, 936046.565 144... | Arden Heights | Staten Island | -74.188485 | 40.552659 |
# Trip preview, shown again alongside the cleaned zone table.
df.head()
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | RatecodeID | store_and_fwd_flag | PULocationID | DOLocationID | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | congestion_surcharge | Airport_fee | cbd_congestion_fee | trip_duration | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2025-01-01 00:18:38 | 2025-01-01 00:26:59 | 1 | 1.60 | 1.0 | N | 229 | 237 | 1 | 10.0 | 3.5 | 0.5 | 3.00 | 0.0 | 1.0 | 18.00 | 2.5 | 0.0 | 0.0 | 8.350000 |
| 1 | 1 | 2025-01-01 00:32:40 | 2025-01-01 00:35:13 | 1 | 0.50 | 1.0 | N | 236 | 237 | 1 | 5.1 | 3.5 | 0.5 | 2.02 | 0.0 | 1.0 | 12.12 | 2.5 | 0.0 | 0.0 | 2.550000 |
| 2 | 1 | 2025-01-01 00:44:04 | 2025-01-01 00:46:01 | 1 | 0.60 | 1.0 | N | 141 | 141 | 1 | 5.1 | 3.5 | 0.5 | 2.00 | 0.0 | 1.0 | 12.10 | 2.5 | 0.0 | 0.0 | 1.950000 |
| 3 | 2 | 2025-01-01 00:14:27 | 2025-01-01 00:20:01 | 3 | 0.52 | 1.0 | N | 244 | 244 | 2 | 7.2 | 1.0 | 0.5 | 0.00 | 0.0 | 1.0 | 9.70 | 0.0 | 0.0 | 0.0 | 5.566667 |
| 4 | 2 | 2025-01-01 00:21:34 | 2025-01-01 00:25:06 | 3 | 0.66 | 1.0 | N | 244 | 116 | 2 | 5.8 | 1.0 | 0.5 | 0.00 | 0.0 | 1.0 | 8.30 | 0.0 | 0.0 | 0.0 | 3.533333 |
This cell builds an interactive Folium map of NYC taxi zones with two choropleths: one for Total rides and one for Avg total fare.
Ride aggregation is done by PULocationID (count) with a companion average total fare; results are merged to zones on LocationID.
Layer control allows switching between the two themes; with both visible, colors can mix, so differences between layers could be harder to read.
Zero-filled areas may also appear as valid values, which could be interpreted as no demand rather than no data.
# 1) Convert zones to GeoDataFrame
# Ensure a GeoDataFrame with a CRS; a plain DataFrame (e.g. geometry held
# as WKT strings) is parsed and promoted.
SET_CRS = "EPSG:2263"
if isinstance(zones, gpd.GeoDataFrame):
    zones_gdf = zones.copy()
    if zones_gdf.crs is None:
        zones_gdf = zones_gdf.set_crs(SET_CRS)
else:
    geom_col = zones["geometry"]
    # WKT strings -> shapely geometries; pass through if already geometries.
    geom = geom_col.map(wkt.loads) if isinstance(geom_col.iloc[0], str) else geom_col
    zones_gdf = gpd.GeoDataFrame(
        zones.drop(columns=["geometry"]).copy(),
        geometry=gpd.GeoSeries(geom),
        crs=SET_CRS,
    )
# Folium renders in WGS84 lon/lat.
zones_gdf = zones_gdf.to_crs(epsg=4326)
# 2) Aggregate rides + avg fare
# Ride count and mean total fare per pickup zone, keyed to LocationID.
agg = (
    df.groupby("PULocationID", as_index=False)
    .agg(rides=("PULocationID", "size"), avg_total_fare=("total_amount", "mean"))
    .rename(columns={"PULocationID": "LocationID"})
)
# Left join keeps zones with no pickups; they are zero-filled below.
# NOTE(review): zero-filling avg_total_fare makes "no data" indistinguishable
# from a genuine $0 average in the choropleth (also flagged in the text above).
g = zones_gdf.merge(agg, on="LocationID", how="left")
g["rides"] = g["rides"].fillna(0).astype(int)
g["avg_total_fare"] = g["avg_total_fare"].fillna(0).apply(lambda x: float(f"{x:.2g}"))
# 3) Folium map
# Center on the centroid of all zone polygons combined.
# NOTE(review): unary_union is deprecated in Shapely 2.x in favor of
# union_all() — confirm the installed version before upgrading.
center = [g.geometry.unary_union.centroid.y, g.geometry.unary_union.centroid.x]
m = folium.Map(location=center, zoom_start=11, tiles="cartodbpositron")
gj = g.to_json()
# Choropleth: Total rides (Blues)
folium.Choropleth(
    geo_data=gj,
    data=g[["LocationID", "rides"]],
    columns=["LocationID", "rides"],
    key_on="feature.properties.LocationID",
    fill_color="Blues",
    fill_opacity=0.7,
    line_opacity=0.3,
    nan_fill_opacity=0.0,
    name="Total rides",
    legend_name="Total rides per zone",
).add_to(m)
# Choropleth: Avg fare (Oranges)
folium.Choropleth(
    geo_data=gj,
    data=g[["LocationID", "avg_total_fare"]],
    columns=["LocationID", "avg_total_fare"],
    key_on="feature.properties.LocationID",
    fill_color="Oranges",
    fill_opacity=0.7,
    line_opacity=0.3,
    nan_fill_opacity=0.0,
    name="Avg total fare",
    legend_name="Average total fare ($)",
).add_to(m)
# Transparent overlay: supplies hover highlighting and the shared tooltip
# for both choropleth layers (control=False keeps it out of the layer list).
GeoJson(
    gj,
    control=False,
    style_function=lambda x: {
        "weight": 0.5,
        "color": "#666",
        "fillOpacity": 0,
    },
    highlight_function=lambda x: {
        "weight": 3,
        "color": "#000",
        "fillOpacity": 0,
    },
    tooltip=GeoJsonTooltip(
        fields=["LocationID", "zone", "borough", "rides", "avg_total_fare"],
        aliases=["LocationID", "Zone", "Borough", "Total rides", "Avg total fare ($)"],
        localize=True,
        sticky=True,
        permanent=False,
    ),
).add_to(m)
# Expanded layer control to toggle the two choropleth themes.
folium.LayerControl(collapsed=False).add_to(m)
m